Adapted from the course material provided by the Institute for Systems Biology, Seattle, Washington, USA.
This project uses gene expression data from the Cancer Genome Atlas (TGCA) Pan-Cancer Atlas Consortium.
# Load packages for analysis
packages <- c(
# set of packages for easy manipulation of data (Author: Hadley Wickam http://hadley.nz/)
"tidyverse",
"magrittr",
"ConsensusClusterPlus",
"sigclust",
"cluster",
"RColorBrewer", # library for east access to multiple colors for plotting
"colorRamps",
"pheatmap",
"gplots",
"survival",
"GEOquery",
"Rtsne",
"knitr"
)
# Check that all required packages are installed i.e. require("ConsensusClusterPlus") and loaded
# sapply -- apply a function to all the elements of a vector like object
sapply(X = packages, FUN = require, character.only = TRUE, quietly=TRUE)
## tidyverse magrittr ConsensusClusterPlus
## TRUE TRUE TRUE
## sigclust cluster RColorBrewer
## TRUE TRUE TRUE
## colorRamps pheatmap gplots
## TRUE TRUE TRUE
## survival GEOquery Rtsne
## TRUE TRUE TRUE
## knitr
## TRUE
# data directory
dataDirectory <- "data"
# results directory
resultsDirectory <- "results"
dataFilePath <- file.path(dataDirectory, "pancan_dataset", "BRCA", "gexp_BRCA_pancan.csv")
print(dataFilePath)
## [1] "data/pancan_dataset/BRCA/gexp_BRCA_pancan.csv"
# load file into environment
gexp = read.csv(dataFilePath, header=T, row.names=1) # baseR
Perform some standard exploratory data analysis and perform quick quality control checks by viewing the data.
# View snippet of the expression matrix
gexp[1:10,1:5]
## TCGA.3C.AAAU TCGA.3C.AALI TCGA.3C.AALJ TCGA.3C.AALK TCGA.4H.AAAK
## COL1A1 55574.20 179261.000 180821.00 434557.0 413984.0
## FN1 46116.70 106743.000 120646.00 136341.0 244774.0
## COL1A2 34666.80 98721.600 94717.10 236808.0 298457.0
## COL3A1 31598.70 71842.800 69689.00 196171.0 239839.0
## ACTB 41867.10 107063.000 110079.00 98775.8 94600.9
## EEF1A1 30241.40 32369.800 57191.30 101169.0 89656.2
## SPARC 21273.00 41839.600 50817.80 115850.0 127788.0
## MGP 1409.65 742.251 7568.45 44747.6 40891.5
## XBP1 39531.00 33026.100 18007.30 66764.6 52518.3
## ACTG1 53008.50 91177.800 101164.00 177436.0 79073.2
Check that there are no NA values.
print(sum(colSums(is.na(gexp))))
## [1] 0
How many samples are in the dataset?
print(paste0("Number of samples: ", dim(gexp)[[2]]))
## [1] "Number of samples: 1082"
print(paste0("Number of genes: ", dim(gexp)[[1]]))
## [1] "Number of genes: 10000"
Visualize the distribution of expression values for each sample by plotting boxplots and density plots to check for outliers and see the overall distribution of the data.
# Boxplots of expression values to check data normlization
# Create a random set of n samples
n <- 50
gexp_rand_sample <- gexp[,sample(1:dim(gexp)[2], n)]
# convert data to long format
# %>% Pipe operator - used to chain commands together
gexp_long <- gexp_rand_sample %>%
mutate(GeneName=rownames(gexp_rand_sample)) %>%
gather("TGCA_Participant_Barcode", "Expression_Level", 1:dim(gexp_rand_sample)[[2]])
gexp_long[1:10,]
## GeneName TGCA_Participant_Barcode Expression_Level
## 1 COL1A1 TCGA.OL.A5RW 68900.600
## 2 FN1 TCGA.OL.A5RW 34764.400
## 3 COL1A2 TCGA.OL.A5RW 42684.100
## 4 COL3A1 TCGA.OL.A5RW 31610.200
## 5 ACTB TCGA.OL.A5RW 114515.000
## 6 EEF1A1 TCGA.OL.A5RW 67054.200
## 7 SPARC TCGA.OL.A5RW 26879.000
## 8 MGP TCGA.OL.A5RW 557.509
## 9 XBP1 TCGA.OL.A5RW 2060.770
## 10 ACTG1 TCGA.OL.A5RW 48464.100
# Visualize using a boxplot with ggplot
gexp_long %>%
ggplot(aes(x=TGCA_Participant_Barcode, y=Expression_Level)) +
geom_boxplot() + # create boxplot using ggplot
theme(axis.text.x = element_text(angle = 90, hjust = 1)) # rotate x-axis label
gexp_long %>%
ggplot(aes(x=Expression_Level, color=TGCA_Participant_Barcode)) +
geom_density() +
labs(title=paste0("mRNA Expression Distribution in Cohort\n Random Sample (n=", n, ")")) +
theme(legend.position="none") +
xlim(0,3000)
## Warning: Removed 61686 rows containing non-finite values (stat_density).
Clearly the data is heavily skewed. Perform a log transformation to scale the data, and then median center each gene’s expression for use in consensus clustering by subtracting the median expression of each gene (row median) from expression in each sample. This mitigates magnitude differences between genes and sets the median to 0 for all genes.
# Perform log transform
gexp_log <- log2(gexp+1)
# Median center data
gexp_log_centered = sweep(gexp_log,1,apply(gexp_log,1,median,na.rm=T))
# Plot new density plot with transformed data
gexp_log_sample <- log2(gexp_rand_sample + 1)
gexp_log_centered_sample = sweep(gexp_log_sample,1,apply(gexp_log_sample,1,median,na.rm=T))
gexp_log_centered_sample %>%
mutate(GeneName=rownames(gexp_log_centered_sample)) %>%
gather("TGCA_Participant_Barcode", "Expression_Level", 1:dim(gexp_log_centered_sample)[[2]])%>%
ggplot(aes(x=Expression_Level, color=TGCA_Participant_Barcode)) +
geom_density() +
labs(title=paste0("mRNA Expression Distribution in Cohort\n Random Sample (n=", n, ")")) +
theme(legend.position="none")
Partition the cohort into a training and testing dataset.
set.seed(10)
hold_out = sample(colnames(gexp_log_centered), size = 350)
gexp_testing = gexp_log_centered[,hold_out] # testing cohort
dim(gexp_testing)
## [1] 10000 350
gexp_training = gexp_log_centered[,which(!(colnames(gexp_log_centered)%in%hold_out))] # training cohort
dim(gexp_training)
## [1] 10000 732
Here, we will use median absolute deviation (MAD) for feature selection.
# Calculate the MAD for each gene in the training dataset
mads = apply(gexp_training,1,mad)
head(mads[order(mads, decreasing = T)])
## SCGB2A2 HMGCS2 CYP4Z1 SCGB1D2 CYP2B7P1 PIP
## 5.824316 5.215207 5.095460 5.094898 4.949365 4.592107
tail(mads[order(mads, decreasing = T)])
## RAF1 HNRNPK KHDRBS1 WDR33 CIAO1 TARDBP
## 0.2581625 0.2450576 0.2247351 0.2210807 0.2199930 0.2018590
Select the most variable genes.
# select top k genes
top_n <- 2000
# subset matrix -- get the most variable genes (more informative for clustering)
gexp_training_most_var = gexp_training[order(mads,decreasing=T)[1:top_n],]
gexp_testing_most_var = gexp_testing[order(mads,decreasing=T)[1:top_n],]
Apply the function ConsensusClusterPlus to carry out consensus clustering for k 2……9.
For this analysis there are many parameter choices but the main ones are the following: - clustering algorithm - clusterAlg + ‘hc’ heirarchical (hclust) + ‘pam’ for paritioning around medoids + ‘km’ for k-means on data matrix + ‘kmdist’ for k-means on distance matrices or a function that returns a clustering. - distance metric - distance + ‘pearson’: (1 - Pearson correlation) + ‘spearman’ (1 - Spearman correlation) + ‘euclidean’ + ‘binary’ + ‘maximum’ + ‘canberra’ + ‘minkowski’ + custom distance function - linkage - the agglomeration method to be used + ward.D + ward.D2 + single + complete + average=UPGMA + mcquitty=WPGMA + median=WPGMC + centroid=UPGMC
#pdf(file.path(resultsDirectory, 'consensusClustering_training_brca.pdf')) # results stored here
#title='mRNA Expression Clustering of TCGA Breast Cancer Dataset'
results = ConsensusClusterPlus(as.matrix(gexp_training_most_var), maxK=9, reps=100, pItem=0.8, pFeature=1, clusterAlg="hc", distance="pearson", innerLinkage ="average" , finalLinkage = "average", seed=10)
## end fraction
## clustered
## clustered
## clustered
## clustered
## clustered
## clustered
## clustered
## clustered
#dev.off()
Next we perform analyses to determine if the discovered clusters are significantly separable using sigclust. Sigclust analyzes whether clusters are really there, using the 2-means (k = 2) clustering index as a statistic. It assesses the significance of clustering by simulation from a single null Gaussian distribution.
# Calculate pairwise significance of clusters for k clusters using sigclust
pairwise_cluster_significance <- function(k, expr_mat, cluster_labels){
#oldw <- getOption("warn") # turn off warnings for MLE estimation
#options(warn = -1)
result_mat <- matrix(nrow = k, ncol = k, dimnames=list(1:k,1:k))
corr_mat = cor(as.matrix(expr_mat),method = "pearson", use="pairwise.complete.obs")
for(i in 1:k) {
for(j in i:k) {
if(!i==j) {
tmp1 = names(which(cluster_labels==i)) # samples in cluster i
tmp2 = names(which(cluster_labels==j)) # samples in cluster j
lab1 = c(rep(1,length(tmp1)),rep(2,length(tmp2))) # label vector
# alternatively use correlation matrix in the sigclust function
tmp3 = sigclust(corr_mat[c(tmp1,tmp2),c(tmp1,tmp2)],nsim=100,labflag=1,label=lab1,
icovest = 3)
result_mat[i,j] = tmp3@pvalnorm
}
}
}
return(result_mat)
}
# correlation matrix for samples
corr_mat = cor(as.matrix(gexp_training_most_var), method = "pearson") # distance metric used for clustering
dissimilarity <- 1 - corr_mat
# Calculate pairwise significance of clusters for 3 clusters
k <- 3
# extract consensus clustering results - cluster labels
clusts = results[[k]]$consensusClass[colnames(gexp_training_most_var)]
head(clusts) # vector with sample cluster membership for k solution
## TCGA.3C.AAAU TCGA.3C.AALI TCGA.3C.AALJ TCGA.4H.AAAK TCGA.5L.AAT0
## 1 2 1 3 3
## TCGA.5T.A9QA
## 1
# Assess signficance of cluster separability
sigclust_mat_k3 <- pairwise_cluster_significance(k=3, expr_mat=gexp_training_most_var, cluster_labels = clusts)
sigclust_mat_k3
## 1 2 3
## 1 NA 3.477327e-191 7.391862e-51
## 2 NA NA 2.948449e-251
## 3 NA NA NA
image(x=1:3, y=1:3, sigclust_mat_k3, zlim=c(0,1), col=colorpanel(256,'blue','black','yellow'),
main='3 clusters', xlab='k', ylab='k')
# Compute silhouette information for 3 clusters
silhouette_k3 <- silhouette(x=clusts, dist = dissimilarity) # distance metric used in clustering)
plot(silhouette_k3, col = RColorBrewer::brewer.pal(3, "Set1"), border=NA, main="Silhouette plot of K=3", cex.names = 0.8)
# Repeat assessments for k 4,5
# Calculate significance of cluster separation for 4 clusters
k <- 4
clusts = results[[k]]$consensusClass[colnames(gexp_training_most_var)]
sigclust_mat_k4 <- pairwise_cluster_significance(k=4, expr_mat=gexp_training_most_var, cluster_labels = clusts)
# Compute silhouette information for 4 clusters
silhouette_k4 <- silhouette(x=clusts, dist = dissimilarity)
# Calculate significance of cluster separation for 5 clusters
k <- 5
clusts = results[[k]]$consensusClass[colnames(gexp_training_most_var)]
sigclust_mat_k5 <- pairwise_cluster_significance(k=5, expr_mat=gexp_training_most_var, cluster_labels = clusts)
silhouette_k5 <- silhouette(x=clusts, dist = dissimilarity)
# Summary plot sigclust results
#pdf(file.path(resultsDirectory, 'training_sigclust.pdf'))
par(mfrow=c(2,2))
image(x=1:3,y=1:3, sigclust_mat_k3, zlim=c(0,1),col=colorpanel(256,'blue','black','yellow'),main='3 clusters',xlab='k',ylab='k')
image(x=1:4,y=1:4, sigclust_mat_k4, zlim=c(0,1),col=colorpanel(256,'blue','black','yellow'),main='4 clusters',xlab='k',ylab='k')
image(x=1:5,y=1:5, sigclust_mat_k5, zlim=c(0,1),col=colorpanel(256,'blue','black','yellow'),main='5 clusters',xlab='k',ylab='k')
image(matrix(data=seq(from=0,to=1,length.out=10),nrow=10,ncol=1),col=colorpanel(256,'blue','black','yellow'),main='Legend',axes=F,xlab='P-Value')
#axis(1)
#dev.off()
#pdf(file.path(resultsDirectory, 'training_silhouette.pdf'), width = 8, height = 11)
par(mfrow=c(3,1))
plot(silhouette_k3, col = RColorBrewer::brewer.pal(3, "Set1"), border=NA,
main="Silhouette plot of K=3", cex.names = 0.8)
plot(silhouette_k4, col = RColorBrewer::brewer.pal(4, "Set1"), border=NA,
main="Silhouette plot of K=4", cex.names = 0.8)
plot(silhouette_k5, col = RColorBrewer::brewer.pal(5, "Set1"), border=NA,
main="Silhouette plot of K=5", cex.names = 0.8)
#dev.off()
What is the optimal cluster solution? Choose k clusters according to diagnostic plots
num_clusters <- 3
clusts = results[[num_clusters]]$consensusClass[colnames(gexp_training_most_var)]
# Plot expression heatmap for training samples and clustering solution
most_var_genes <- names(mads[order(mads,decreasing=T)[1:300]])
meta_df <- as.data.frame(clusts)
meta_df$clusts <- as.factor(meta_df$clusts)
names(meta_df) <- c("Consensus Cluster")
pheatmap::pheatmap(gexp_training_most_var[most_var_genes, names(clusts[order(clusts)])], show_rownames=FALSE, show_colnames=FALSE,
annotation_col = meta_df,
cluster_rows=TRUE, cluster_cols=FALSE, scale="row",
color = colorRamps::matlab.like(25), annotation_names_col = F,
main="Top 300 Most Variable Genes\n Training Cohort",
filename = file.path(resultsDirectory, "heatmap_training_cohort_most_variable_expression_brca.pdf")
)
# Create centroids for clustering new samples
centroids = matrix(nrow=nrow(gexp_training_most_var),ncol=num_clusters,dimnames=list(rownames(gexp_training_most_var),c(1:num_clusters)))
for(clust in 1:num_clusters) {
tmp1 = names(which(clusts==clust))
centroids[,clust] = apply(gexp_training_most_var[,tmp1],1,median)
}
head(centroids)
## 1 2 3
## SCGB2A2 -0.4557219 -3.925706 2.750578
## HMGCS2 -0.8658018 -3.768695 2.448184
## CYP4Z1 -0.7187862 -3.031524 2.808021
## SCGB1D2 -0.2963639 -3.685027 2.155398
## CYP2B7P1 1.5957222 -6.126503 1.247740
## PIP -0.6098934 -3.255275 1.955698
To assign testing set samples to a learned cluster, we find the correlation between each sample and the cluster centroids
corr_mat_testing = cor(cbind(centroids, gexp_training_most_var),method='pearson', use= "pairwise.complete.obs")[-c(1:num_clusters),1:num_clusters]
# assign each sample (row) to the nearest centroid (i.e. the one with highest correlation)
clusts_testing= sapply(1:nrow(corr_mat_testing), function(x) { which(corr_mat_testing[x,]==max(corr_mat_testing[x,])) })
names(clusts_testing) = rownames(corr_mat_testing)
head(clusts_testing)
## TCGA.3C.AAAU TCGA.3C.AALI TCGA.3C.AALJ TCGA.4H.AAAK TCGA.5L.AAT0
## 1 2 1 3 3
## TCGA.5T.A9QA
## 1
How well are the testing set samples represented in the clustering solution?
# Calculate pairwise significance of clusters for k clusters
sigclust_testing <- pairwise_cluster_significance(k = num_clusters, expr_mat = gexp_training_most_var, cluster_labels = clusts_testing)
# Compute silhouette information for clustering in 5 clusters
# correlation matrix for samples
corr_mat = cor(as.matrix(gexp_training_most_var), method = "pearson") # distance metric used for clustering
dissimilarity <- 1 - corr_mat
silhouette_testing <- silhouette(x=clusts_testing, dist = dissimilarity) # distance metric used in clustering)
# Plot significance of separability on hold-out test cohort
#pdf(file.path(resultsDirectory, 'testing_sigclust_silhouette_lung.pdf'))
par(mfrow=c(2,2))
image(x=1:num_clusters,y=1:num_clusters, sigclust_testing, zlim=c(0,1),col=colorpanel(256,'blue','black','yellow'),main=paste0(num_clusters, ' clusters'),xlab='k',ylab='k')
image(matrix(data=seq(from=0,to=1,length.out=100),nrow=10,ncol=1),col=colorpanel(256,'blue','black','yellow'),main='Legend',axes=F,xlab='P-Value')
axis(1)
plot(silhouette_testing, col = RColorBrewer::brewer.pal(num_clusters, "Set2"), main="Silhouette Plot of Testing Cohort", border=NA)
#dev.off()
# Write out the subtype definitions for TCGA discovery and testing cohorts
write.csv(cbind(names(clusts),clusts),row.names=F, file.path(resultsDirectory, 'tcga_brca_training_clusters.csv'))
write.csv(cbind(names(clusts_testing),clusts_testing),row.names=F, file.path(resultsDirectory, 'tcga_brca_testing_clusters.csv'))